df_artist <- read.csv("data/df_artist_sin_duplicados.csv")
df_charts_raw <- read.csv("data/df_charts_sin_duplicados.csv")
df_audio_features_raw <- read.csv("data/audio_features_plano_sin_duplicados.csv")
df_lyrics <- read.csv("data/df_lyrics.csv")
# DF listo para el join con chrats
df_audio_features <- df_audio_features_raw %>%
group_by(track_name, external_urls_spotify) %>%
mutate(artist_all = paste(artist_name, collapse = ",|,")) %>%
ungroup() %>%
mutate(artist_key = sub(",|,.*", "", artist_all)) %>%
dplyr::select(artist_name, artist_all, artist_key, everything(.)) %>%
distinct(artist_key, external_urls_spotify, .keep_all = T) %>%
as.data.frame()
cant_marketscontar_market <- function(x){
q <- length(unlist(strsplit(x, split = ",")))
return (q)
}
df_audio_features$cant_markets <- sapply(df_audio_features[,"markets_concat"], contar_market)
#features var continuos
features_continuas <- c('acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'cant_markets')
#features var_ categóricas
features_categoricas <- c('explicit', 'key_name', 'mode_name', "key_mode")
fit <- lm(loudness~energy+acousticness, data=df_audio_features)
modelo <- fit$coefficients
df_audio_features$loudness_reg_imp <- df_audio_features$loudness
X <- df_audio_features[df_audio_features$loudness>0, c('energy', "acousticness")]
df_audio_features$loudness_reg_imp[df_audio_features$loudness>0] <- modelo[1]+modelo[2]*X[,1]+modelo[3]*X[,2]
summary(df_audio_features[,c("loudness", "loudness_reg_imp")])
summary(fit)
#graficos con loudness con imputacion
par(mfrow = c(2,1))
hist(df_audio_features[,'loudness_reg_imp'], main='loudness', xlab="")
#hist(sqrt(df_audio_features[,'loudness_reg_imp']), main= 'loudness_sqrt', xlab="")
boxplot(df_audio_features[,'loudness_reg_imp'], horizontal = T)
#boxplot(sqrt(df_audio_features[,'loudness_reg_imp']), horizontal = T)
#metrica de popularidad
df_charts <- df_charts_raw %>%
group_by(Artist, Track_Name) %>%
dplyr:: summarise(semanas_sum = as.double(n()),
streams_sum = (sum(Streams, na.rm = T)/10^6 ),
streams_min = (min(Streams)/10^6 ),
streams_max = (max(Streams)/10^6 ),
streams_avg = (mean(Streams)/10^6),
position_avg = mean(Position, na.rm = T),
position_median = median(Position, na.rm = T),
position_min = min(Position),
position_max = max(Position)) %>%
ungroup() %>%
mutate(popularidad = as.numeric(streams_sum*semanas_sum/position_avg) )
library(reshape2)
ggplot(melt(df_charts[,3:ncol(df_charts)]), aes(value))+
geom_histogram()+
facet_wrap(~variable , scales = "free")
audio_features Y charts#Armamos un join para tener una tabla de charts con las caracteristicas de las canciones
# deberian quedar 22993 filas completas
join_audio_charts <- df_audio_features %>%
select("artist_name","artist_all","artist_key",
"track_name", "external_urls_spotify", "album_name", "album_release_year",
all_of(features_continuas), all_of(features_categoricas)) %>%
right_join( df_charts,# %>%
by = c(
"track_name" = "Track_Name",
"artist_key" ="Artist"))
#HAY CHARTS QUE NO TIENEN FEATURES. HAY QUE TENERLO EN CUENTA PARA EL ANÁLISIS
library(mice)
md.pattern(join_audio_charts, rotate.names = TRUE)
##histograma de las variables continuas de audio_features
for (i in features_continuas){
hist(df_audio_features[,i], main = paste("Histograma de", i, "(all data)"), xlab = i)
abline(v = mean(df_audio_features[,i], na.rm = TRUE) , col="red")
abline(v = median(df_audio_features[,i], na.rm = TRUE) , col="blue")
legend("topright", legend = c("Media", "Mediana"), col=c("red", "blue"), lty =1)
}
#divido los features por su distribución
features_continuas_media <- c('danceability', 'tempo', 'valence')
features_continuas_mediana <- c('acousticness', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets')
##histograma de las variables continuas de charts
for (i in c(features_continuas)){
hist(join_audio_charts[,i], main = paste("Histograma de", i, "(charts)"), xlab = i)
abline(v = mean(join_audio_charts[,i], na.rm = TRUE) , col="red")
abline(v = median(join_audio_charts[,i], na.rm = TRUE) , col="blue")
}
#divido features de charts según su distribución
audio_charts_continuas_media <- c('duration_ms', 'valence')
audio_charts_continuas_mediana <- c('danceability', 'acousticness', 'tempo', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets', "Streams")
##medidas resumen y barplots de las variables categoricas audio_features
for(i in features_categoricas){
barplot(sort(table(df_audio_features[,i]),decreasing = T), las=2,
main = paste("Barplot de", i, "(all data)"))
# pie(table(df_features_categoricos[,i]))
}
##medidas resumen y barplots de las variables categoricas join_audio_charts
for(i in features_categoricas){
barplot(table(join_audio_charts[,i]), las=2,
main = paste("Barplot de", i, "(charts)")
)
# pie(table(df_features_categoricos[,i]))
}
hist(df_audio_features$instrumentalness)
par(mfrow=c(4,3))
for (feature in features_continuas){
boxplot(df_audio_features[,feature], las=2, horizontal=T, main=feature)
}
Con excepción de valence el resto de las features poseían cierto sesgo. Se decidió transformar las variables que mayor sesgo poseían: duration_ms, instrumentalness, liveness, speechiness como método de corregir la distribución y achicar la cantidad de outliers. La variable loudness_reg_imp no fue modificada debido a que al ser negativa
# "danceability,tempo,valence,acousticness,duration_ms,energy,instrumentalness,liveness,speechiness,cant_markets"
#sesgos d las variables
sort(apply(df_audio_features[,features_continuas], MARGIN = 2, function(x){ (3* (mean(x,na.rm = T)-median(x, na.rm = T)))/sd(x, na.rm = T)} ))
variables_sesgo <- unlist(strsplit("acousticness,duration_ms,instrumentalness,liveness,speechiness,cant_markets,energy", ","))
df_sesgadas <- df_audio_features[,variables_sesgo]
logaritmo_ajustado = function(x,delta){
if (x==0.0){
return(log(0.00+delta, base = 10))
}else{
return(log(x, base = 10))
}
}
delta <- 10^(-6)
df_sesgadas_log_adjust <- data.frame(apply(df_audio_features[,variables_sesgo], MARGIN = c(1,2),
function(x) logaritmo_ajustado(x,delta)))
ggplot(reshape::melt(df_sesgadas), aes(value))+
geom_histogram()+
facet_wrap(~variable)
ggplot(reshape::melt(df_sesgadas_log_adjust), aes(value))+
geom_histogram()+
facet_wrap(~variable)
####################################################
# names(df_sesgadas_log_adjust) <- paste(names(df_sesgadas), "_log", sep="")
# names(df_sesgadas_log_adjust) <- names(df_sesgadas)
# df_datos <- cbind(df_sesgadas, df_sesgadas_log_adjust)
a <- df_sesgadas
b <- df_sesgadas_log_adjust
names(b) <- paste(names(df_sesgadas), "_log", sep="")
merged <- cbind(a,b)
merged <- merged[, order(names(merged))]
round((
apply(
merged, MARGIN = 2, function(x){ (3* (mean(x,na.rm = T)-median(x, na.rm = T)))/sd(x, na.rm = T)}
)
),2)
#histogramas de vbles transformadas con logaritmo
# transformacion <- c('instrumentalness','loudness','liveness','speechiness', 'duration_ms')
par(mfrow=c(3,5))
for (feature in variables_sesgo){
hist(df_audio_features[,feature], main=feature)
}
for (feature in variables_sesgo){
hist(unlist(lapply(df_audio_features[,feature], function(x) logaritmo_ajustado(x,delta))), main=paste(feature,"log", sep="_"))
}
inv_sqrt_ajustada = function(x, delta){
if (x==0.0){
return(1/sqrt(x+0.000001))
}else{
return(1/sqrt(x))
}
}
delta <- sqrt(10^(-6))
par(mfrow=c(3,5))
for (feature in variables_sesgo){
hist(df_audio_features[,feature], main=feature)
}
for (feature in variables_sesgo){
hist(unlist(lapply(df_audio_features[,feature], function(x) inv_sqrt_ajustada(x,delta))), main=paste(feature,"inv_sqt", sep="_"))
}
df_sesgadas_inv_sqrt <- data.frame(apply(df_audio_features[,variables_sesgo], MARGIN = c(1,2),
function(x) inv_sqrt_ajustada(x,delta)))
#nuevos sesgos con inversa raiz cuadrada
a <- df_sesgadas
b <- df_sesgadas_inv_sqrt
names(b) <- paste(names(df_sesgadas), "_invsqrt", sep="")
merged <- cbind(a,b)
merged <- merged[, order(names(merged))]
round((
apply(
merged, MARGIN = 2, function(x){ (3* (mean(x,na.rm = T)-median(x, na.rm = T)))/sd(x, na.rm = T)}
)
),2)
b <- df_sesgadas_inv_sqrt[,c("cant_markets", "energy", "instrumentalness", "liveness" , "speechiness")]
names(b) <- paste(names(b), "_invsqrt", sep="")
par(mfrow=c(1,2))
ggplot(data=reshape::melt(b), aes(value))+
geom_histogram(bins = 5)+facet_wrap(~variable, scales = "free")
ggplot(data=reshape::melt(a), aes(value))+
geom_histogram(bins = 5)+facet_wrap(~variable, scales = "free")
df_ses_chart <- df_charts %>%
ungroup() %>%
select( "Track_Name", "Artist",
"streams_avg", "position_avg",
"semanas_sum", "popularidad")
df_ses_chart_log <- data.frame(apply(df_ses_chart[,3:6], MARGIN = c(1,2),
function(x) logaritmo_ajustado(x,delta)))
cat("originales\n")
#sesgos d las variables
(apply(df_ses_chart[,3:6], MARGIN = 2, function(x){ (3* (mean(x,na.rm = T)-median(x, na.rm = T)))/sd(x, na.rm = T)} ))
cat("\ntransformadas\n")
(apply(df_ses_chart_log, MARGIN = 2, function(x){ (3* (mean(x,na.rm = T)-median(x, na.rm = T)))/sd(x, na.rm = T)} ))
ggplot(reshape2::melt(df_ses_chart[,3:6]), aes(value))+
geom_histogram()+
facet_wrap(~variable, scales = "free")+
labs(title = "Histogramas originales")
ggplot(reshape::melt(df_ses_chart_log), aes(value))+
geom_histogram()+
facet_wrap(~variable, scales = "free")+
labs(title = "Histogramas transformados")
df_ses_chart_inv_sqrt <- data.frame(apply(df_ses_chart[,3:6], MARGIN = c(1,2),
function(x) inv_sqrt_ajustada(x,delta)))
cat("\ntransformadas\n")
(apply(df_ses_chart_inv_sqrt, MARGIN = 2, function(x){ (3* (mean(x,na.rm = T)-median(x, na.rm = T)))/sd(x, na.rm = T)} ))
charts_sin_sesgo <- cbind(df_ses_chart[,-c(3,5)],
"semanas_sum" =df_ses_chart_log[,c("semanas_sum")],
"streams_avg" = df_ses_chart_inv_sqrt[,c("streams_avg")])
#join entre variables transformadas y resto features
audio_sin_sesgo <- df_audio_features %>%
select("artist_name", "artist_key",
"track_name",
all_of(features_continuas), all_of(features_categoricas)) %>%
select(!variables_sesgo)
audio_ft_to_discretize <- cbind(audio_sin_sesgo, df_sesgadas_log_adjust[,c("acousticness", "duration_ms")],
df_sesgadas_inv_sqrt[,c("instrumentalness", "cant_markets",
"liveness" , "speechiness")]) %>%
select(-c( "key_name", "explicit", "mode_name" ,"key_mode")) %>%
group_by(artist_name, artist_key, track_name) %>%
distinct(.keep_all = T)
#normalizacion de todas las vbles
scale_vble <- function(x){
(x - mean(x, na.rm = T))/sd(x, na.rm = T)
}
audio_ft_to_discretize_norm <- audio_ft_to_discretize %>%
# mutate_all(scale)
mutate_all(scale_vble)#
ggplot(data= melt(audio_ft_to_discretize, id.vars = c("artist_name", "artist_key", "track_name")), aes(value))+
geom_histogram()+
facet_wrap(~variable, scales = "free")
ggplot(data= melt(audio_ft_to_discretize_norm, id.vars = c("artist_name", "artist_key", "track_name")), aes(value))+
geom_histogram()+
facet_wrap(~variable, scales = "free")
charts_sin_sesgo_norm <- charts_sin_sesgo %>%
rename(track_name = Track_Name, artist = Artist) %>%
ungroup() %>%
select(-c(track_name, artist)) %>%
mutate_all(function(x) {scale_vble(x)})
# mutate_at(., .vars = vars(-group_cols("Track_Name", "Artist")), function(x) {scale_vble(x)})
charts_sin_sesgo_norm <- cbind(charts_sin_sesgo[, c("Artist", "Track_Name")], charts_sin_sesgo_norm )
#funciones
cut_median <- function(x){
cut(x,
breaks = c(0,median(x)/2,
median(x),
quantile(x,probs = 0.75),
Inf),
include.lowest = T,
labels=c("Baja","Media","Alta", "Muy alta") )
}
cut_binary <- function(x){
cut(x,
breaks = c(min(x,na.rm = T),
median(x),
Inf),
include.lowest = T,
labels=c("Baja","Alta"))
}
cut_cuantile <- function(x){
cut(x,
breaks = quantile(x),
include.lowest = T,
labels=c("Baja","Media","Alta", "Muy alta") )
}
#select data
df_audio_ft_select <-df_audio_features %>%
select(artist_name, artist_key, track_name, features_continuas, -loudness,loudness = loudness_reg_imp ) %>%
group_by(track_name, artist_key, artist_name) %>%
summarise_all(function(x) mean(x, na.rm = T)) %>%
left_join(df_audio_features %>%
select(artist_name, artist_key, track_name, explicit, mode_name) %>%
distinct(artist_name, artist_key, track_name, .keep_all = T) )
#creacion columnas
#by median
df_audio_ft_select$acousticness_cat <- cut_median(df_audio_ft_select$acousticness)
df_audio_ft_select$duration_ms_cat <- cut_median(df_audio_ft_select$duration_ms)
df_audio_ft_select$liveness_cat <-cut_median(df_audio_ft_select$liveness)
df_audio_ft_select$speechiness_cat <- cut_median(df_audio_ft_select$speechiness)
#by binary
df_audio_ft_select$instrumentalness_cat <- cut_binary(df_audio_ft_select$instrumentalness)
df_audio_ft_select$loudness_cat <- cut_binary(df_audio_ft_select$loudness)
#by cunatile
df_audio_ft_select$tempo_cat <- cut_cuantile(df_audio_ft_select$tempo)
df_audio_ft_select$valence_cat <- cut_cuantile(df_audio_ft_select$valence)
df_audio_ft_select$danceability_cat <- cut_cuantile(df_audio_ft_select$danceability)
#cant markets
df_audio_ft_select$cant_markets_cat <- cut(df_audio_ft_select$cant_markets,
breaks = c(0, 70, 110, Inf),
labels = c("Baja","Media","Alta"))
#true categories vbles
df_audio_ft_select$explicit_cat <- ifelse(df_audio_ft_select$explicit ==TRUE, "Si", "No")
df_audio_ft_select$mode_name_cat <- df_audio_ft_select$mode_name
# filtro
df_audio_ft_cat <- df_audio_ft_select %>%
select( artist_name, artist_key, track_name, contains("_cat") )
x <- df_audio_ft_cat <- df_audio_ft_select %>%
select( contains("_cat") )
for(i in names(x) ){
barplot(table(x[,i]), las=2,
main = paste("Barplot de", i, "(charts)") )
# cat(table( x[,i] ))
}
df_charts_sel <- df_charts %>%
ungroup() %>%
select( "Track_Name", "Artist",
"streams_avg", "position_median",
"semanas_sum", "popularidad")
ggplot(reshape2::melt(df_charts_sel[,3:6]), aes(value))+
geom_histogram()+
facet_wrap(~variable, scales = "free")+
labs(title = "Histogramas originales")
df_charts_sel$streams_avg_cat = cut_cuantile(df_charts_sel$streams_avg)
df_charts_sel$popularidad_cat = cut_cuantile(df_charts_sel$popularidad)
df_charts_sel$position_median_cat = cut(df_charts_sel$position_median,
breaks = quantile(df_charts_sel$position_median),
include.lowest = T,
labels=c( "Muy alta", "Alta", "Media", "Baja"))
df_charts_sel$semanas_sum_cat = cut(df_charts_sel$semanas_sum,
breaks = c(0, 2,4,13, Inf),
include.lowest = T,
labels=c("Baja","Media","Alta", "Muy alta"))
# filtro
df_charts_cat <- df_charts_sel %>%
select( Artist, Track_Name, contains("_cat") )
#prueba fallida
audio_ft_to_discretize_norm %>%
ungroup() %>%
group_by(artist_name, track_name, artist_key) %>%
mutate_all( function(x){ cut(x, quantile(x, probs = seq(0, 1, 0.25
), na.rm = T,
# ) , labels = c("Muy Alta","Alta" ,"Media", "Baja"
# ) , labels = c("Muy Alta","Alta" ,"Media", "Baja"
) , labels = c("Muy Alta" ,"Alta", "Media", "Baja"
)
) }
)
# Discretizaciones basadas en Quantile (FUNCIONA)
library(R.oo)
cols <- names(audio_ft_to_discretize_norm)[!names(audio_ft_to_discretize_norm) %in% c("artist_name","artist_key", "track_name")]
df_num = audio_ft_to_discretize_norm[,cols]
df_cat <- audio_ft_to_discretize_norm[, c("artist_name","artist_key", "track_name")]
for(i in seq_along(df_num) ){
breaks =unique(quantile(df_num[[i]], probs = seq(0, 1, 0.33) ,na.rm = T))
# breaks =unique(quantile(df_num[[i]], probs = seq(0, 1, 0.25)))
# breaks =quantile(df_num[[i]] , names = FALSE)
label = intToChar(65:(63+length(breaks)))
# label = c("Muy Alta", "Alta", "Media", "Baja")
x <- cut(df_num[[i]], breaks=breaks,
labels = label )
# labels = c("Muy Alta","Alta" ,"Media", "Baja") )
df_cat <- cbind(df_cat, x )
}
# seteo de nombres
names(df_cat) <- names(audio_ft_to_discretize_norm)
# audio_ft_to_discretize_norm %>% filter(track_name =="Juice WRLD Speaks From Heaven - Outro")
# df_cat %>% filter(track_name =="Baby")
# df_cat %>% filter(track_name =="goodbye")
# df_cat %>% filter(track_name =="DÁKITI")
# audio_ft_to_discretize_norm %>% filter(track_name =="Rule The World (feat. Ariana Grande)")
# df_charts %>% filter(Track_Name =="Rule The World (feat. Ariana Grande)")
audio_ft_to_discretize_norm %>%
group_by(artist_name, track_name) %>%
arrange(streams_avg)
# slice(which.max(danceability))
# filter(danceability == max(danceability, na.rm =T))
#analisis de missing values
mice::md.pattern(df_cat, plot = T, rotate.names = T)
sum(!complete.cases(df_cat))
summary(VIM::aggr(df_cat, sortVar= T,plot=F))
# JOIN FINAL
#falta transformar df_charts !!!
join_audio_charts <- audio_ft_to_discretize_norm %>%
right_join( charts_sin_sesgo_norm ,
by = c("track_name" = "Track_Name",
"artist_key" ="Artist")) %>%
select(-c("artist_key", "key_name", "explicit", "mode_name" ,"key_mode")) %>%
group_by(artist_name, track_name) %>%
summarise_all( function(x) mean(x, na.rm= T)) %>%
ungroup() %>%
filter(!is.na(artist_name)) %>%
group_by(artist_name, track_name)# %>%
# mutate_all(function(x) scales::rescale(x, to=c(0,1)))
# mutate_all(function(x) (x-min(x, na.rm = T)) / (max(x, na.rm = T)-min(x, na.rm = T)) )
################################
## FILTRAMOS OUTLIERS POR Z-SCORE para 'danceability', 'tempo', 'valence'
##############################
#z-score para variables que tienden a la normal
#filtro features numericos
#divido los features por su distribución
features_continuas_media <- c('danceability', 'tempo', 'valence')
df_audio_features_zscore_media <- df_audio_features[,features_continuas_media]
#normalizo z score con las variables que tienden a la normal
zscore_cols <- c()
for(col in names(df_audio_features_zscore_media)){
name_col <- paste('zscore_',col, sep = "")
zscore_cols <- append(zscore_cols, name_col)
media <- mean(df_audio_features_zscore_media[,col])
stdv <- sd(df_audio_features_zscore_media[,col])
df_audio_features_zscore_media[,name_col] <- (df_audio_features_zscore_media[,col] - media)/stdv
}
par(mfrow=c(1,length(zscore_cols)))
lapply(zscore_cols, function(col) boxplot(df_audio_features_zscore_media[,col],xlab=col))
Danceability
#variable: danceability
umbral_zscore <- 3
conditions <- (df_audio_features_zscore_media$zscore_danceability> umbral_zscore) | (df_audio_features_zscore_media$zscore_danceability< -1*umbral_zscore)
df_audio_features[conditions,] %>%
select(album_name,artist_name, danceability ) %>%
arrange(-danceability)
Tempo
#variable: Tempo
umbral_zscore <- 3
conditions <- (df_audio_features_zscore_media$zscore_tempo> umbral_zscore) | (df_audio_features_zscore_media$zscore_tempo< -1*umbral_zscore)
df_audio_features[conditions,] %>%
select(album_name,artist_name, tempo ) %>%
arrange(-tempo)
Valence
#variable: valence
umbral_zscore <- 3
conditions <- (df_audio_features_zscore_media$zscore_valence> umbral_zscore) | (df_audio_features_zscore_media$zscore_valence< -1*umbral_zscore)
df_audio_features[conditions,] %>%
select(album_name,artist_name, valence ) %>%
arrange(-valence)
################################
## FILTRAMOS OUTLIERS POR Z-SCORE MODIFICADO para 'acousticness', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets'
##############################
features_continuas_mediana <- c('acousticness', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets')
df_audio_features_zscore_mediana <- df_audio_features[,features_continuas_mediana]
zscoremodif_cols <- c()
for(col in names(df_audio_features_zscore_mediana)){
name_col <- paste('zscoremodif_',col, sep = "")
zscoremodif_cols <- append(zscoremodif_cols, name_col)
med = median(df_audio_features_zscore_mediana[,col], na.rm = T)
MAD = median(abs(df_audio_features_zscore_mediana[,col] - med), na.rm = T)
df_audio_features_zscore_mediana[, name_col] <- 0.6745 * (df_audio_features_zscore_mediana[,col] - med) / MAD
}
par(mfrow=c(4,2))
lapply(zscoremodif_cols, function(col) boxplot(df_audio_features_zscore_mediana[,col],xlab=col, horizontal = T))
Revisión Variable Instrumentalness
instrumentalness <- c("instrumentalness", "zscoremodif_instrumentalness")
x <- df_audio_features$instrumentalness
n_interv <- 10
intervalos <- round(seq(0,max(x),by=(max(x)-min(x))/n_interv),2)
labs <- c()
for (i in 1:n_interv){
lab <- paste(intervalos[i],intervalos[i+1], sep='\n')
labs <- append(labs, lab)
}
bins <- cut(x, n_interv, include.lowest = TRUE, labels = labs)
barplot(table(bins))
Hacemos K-means para poder discretizar la variable.
sse <- c()
for (k in 2:6){
clusters <- kmeans(df_audio_features$instrumentalness,centers = k, iter.max = 10, nstart = k)
sse <- append(sse, clusters$tot.withinss)
}
plot(2:6,sse, type = 'l', xlab='Cantidad de Clusters', ylab='Suma Error Cuadrático')
#k=3
clusters3 <- kmeans(df_audio_features$instrumentalness,centers = 3, iter.max = 10, nstart = 3)
df_audio_features$clusters <- factor(clusters3$cluster)
lev <- levels(df_audio_features$clusters)
labs <- c()
for (i in lev){
min <- min(df_audio_features$instrumentalness[df_audio_features$clusters==i])
max <- max(df_audio_features$instrumentalness[df_audio_features$clusters==i])
lab <- paste(min,max, sep=' - ')
labs <- append(labs, lab)
}
labs
# barplot(table(factor(clusters3$cluster)), labels = labs)
#prueba igal de transformacion y test de normalidad
join_audio_charts[1:5,"acousticness"]^2
library(nortest)
log10(df_chart_w_lyrics$acousticness)
for (i in features_continuas){
x <- log10(df_chart_w_lyrics[,i])
x <- shapiro.test(x)
z <- x$p.value
print(z)
}
|# LYRICS
# tabla contingencia de idiomas
# idiomas = textcat(df_lyrics_unicas$lyrics)
# sort(table(idiomas), decreasing = T)
#El word_count_df se realiza con Python (ejecutar en_lyrics_word_coun)
word_counts_df <- read.csv("data/en_lyrics_word_count.csv")
x <- 1:nrow(word_counts_df)
f <- word_counts_df$counts
plot(x,f, type="l",
log = 'xy',
col="blue",
lwd=3,
ylab = "Frecuencia Absoluta",
xlab = "",
main="Frecuencia de Palabras - Ley de Zipf")
library(stopwords)
Attaching package: 㤼㸱stopwords㤼㸲
The following object is masked from 㤼㸱package:tm㤼㸲:
stopwords
threshold <- 1000
x <- as.vector(word_counts_df[word_counts_df$counts>threshold,]$word)
not_remove_list = c("bitch","fuck","love","baby","nigga","feel", "girl", "shit")
top_stopwords <- setdiff(x, not_remove_list)
smart_stopwords <- stopwords("en", source = "smart")
my_eng_stopwords <- unique(append(smart_stopwords, top_stopwords))
my_spa_stopwords <- unique(text_cleaning(stopwords("es", source = "stopwords-iso"), language = "es"))
del_stopwords = function(txt, stopword_list){
remove_regex = paste("\\b(", paste(stopword_list, collapse = "|"),")\\W", sep="")
txt = gsub(remove_regex, " ", txt)
txt = gsub("\\W+\\b", " ", txt)
return(txt)
}
spa_lyrics$sin_stopwords <- del_stopwords(spa_lyrics$lyrics_cleaning, my_spa_stopwords)
en_lyrics$sin_stopwords <- del_stopwords(en_lyrics$lyrics_cleaning, my_eng_stopwords)
#Diccionario español
malas_palabras_1 <- read_csv("data/malas_palabras.txt",
col_names = FALSE)
-- Column specification ---------------------------------------------------------------------------
cols(
X1 = col_character()
)
malas_palabras_2 <- read_csv("data/malas_palabras_translate.txt",
col_names = FALSE)
-- Column specification ---------------------------------------------------------------------------
cols(
X1 = col_character()
)
malas_palabras_3 <- read_csv("data/malas_palabras_wiki.txt",
col_names = FALSE) %>%
select(X1)
-- Column specification ---------------------------------------------------------------------------
cols(
X1 = col_character(),
X2 = col_character(),
X3 = col_character(),
X4 = col_character(),
X5 = col_character(),
X6 = col_character()
)
52 parsing failures.
row col expected actual file
2 -- 6 columns 3 columns 'data/malas_palabras_wiki.txt'
3 -- 6 columns 2 columns 'data/malas_palabras_wiki.txt'
4 -- 6 columns 3 columns 'data/malas_palabras_wiki.txt'
5 -- 6 columns 4 columns 'data/malas_palabras_wiki.txt'
6 -- 6 columns 2 columns 'data/malas_palabras_wiki.txt'
... ... ......... ......... ..............................
See problems(...) for more details.
malas_palabras_4 <- read_csv("data/palabras_profanas_es.txt",
col_names = FALSE)
-- Column specification ---------------------------------------------------------------------------
cols(
X1 = col_character()
)
malas_palabras <- rbind(malas_palabras_1, malas_palabras_2,
malas_palabras_3, malas_palabras_4)
#Hacer unique para eliminar las repetidas
malas_palabras$limpias = text_cleaning(malas_palabras$X1, language="es")
#Genero lista de malas palabras
racist_words <- unique(tolower(lexicon::profanity_racist))
biglou <- read.csv("https://www.cs.cmu.edu/~biglou/resources/bad-words.txt", header=FALSE, col.names = c("words"))
corpus_esp
<<SimpleCorpus>>
Metadata: corpus specific: 1, document level (indexed): 0
Content: documents: 129
df_ly_feat_esp <- cbind(spa_lyrics[, 1:19], df_tm_esp)
nrow(spa_lyrics)
[1] 129
nrow(df_ly_feat_esp)
[1] 129
na.omit(df_ly_feat_esp)
mice::md.pattern(df_ly_feat_esp[, 153:ncol(df_ly_feat_esp)], rotate.names = T)
/\ /\
{ `---' }
{ O O }
==> V <== No need for mice. This data set is completely observed.
\ \|/ /
`-----'
uah uoh vamo vas ven veo verte vida viene vuelvo woh work wuh yah yeah yeh you
129 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
df_ly_feat_ok_esp <- df_ly_feat_esp[, filter]
df_ly_feat_ok_esp$id = 1:nrow(df_ly_feat_ok_esp)
df_melt_esp <- reshape2::melt(data = df_ly_feat_ok_esp[,4:ncol(df_ly_feat_ok_esp)], id.vars = c("id")) %>%
arrange(id)
attributes are not identical across measure variables; they will be dropped
df_melt_esp <- df_melt_esp[df_melt_esp$value != 0,]
df_melt_esp_txt <- df_melt_esp[df_melt_esp$value == 1,]
df_melt_esp_cat <- df_melt_esp[df_melt_esp$value != 1,]
df_melt_esp_cat$variable = paste0(df_melt_esp_cat$variable, "=", as.character(df_melt_esp_cat$value))
#denomino a los términos profanos
df_melt_esp_txt <- df_melt_esp_txt %>%
mutate(variable = case_when(as.character(variable) %in% malas_palabras$limpias ~
paste0("PROFANE_", as.character(variable)),
T ~ paste0("TERM_", as.character(variable))
)
)
df_melt_txt_to_ruls_esp <- rbind(df_melt_esp_txt, df_melt_esp_cat)
df_melt_txt_to_ruls_esp <- na.omit(df_melt_txt_to_ruls_esp[,-c(3)])
names(df_melt_txt_to_ruls_esp ) = c("TID", "item")
reglas <- apriori(lyrics_trans, parameter = list(support=0.01,
confidence = 0.1, target = "rules" ))
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 8
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[206 item(s), 852 transaction(s)] done [0.00s].
sorting and recoding items ... [203 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6 7
Mining stopped (time limit reached). Only patterns up to a length of 7 returned!
done [7.40s].
writing ...
df_lyrics_unicas <- df_lyrics %>% distinct(artist_name, track_name, lyrics)
nrow(df_lyrics_unicas)
df_chart_w_lyrics <- merge(join_audio_charts, df_lyrics_unicas, by.x = c("artist_name","track_name"), by.y= c("artist_name","track_name"), all.x=TRUE, all.y = FALSE)
df_chart_w_lyrics <- df_chart_w_lyrics[!is.na(df_chart_w_lyrics$lyrics),]
bad_words <- c()
bad_words <- append(bad_words, unique(tolower(lexicon::profanity_zac_anger)))
bad_words <- append(bad_words, unique(tolower(lexicon::profanity_alvarez)))
bad_words <- append(bad_words, unique(tolower(lexicon::profanity_arr_bad)))
bad_words <- append(bad_words, unique(tolower(lexicon::profanity_racist)))
bad_words <- append(bad_words, unique(tolower(lexicon::profanity_banned)))
bad_words <- unique(bad_words)
contar_bad_words <- function(x){
x <- profanity(x,profanity_list = bad_words)
q <- sum(x$profanity_count)
return (q)
}
df_chart_w_lyrics$cant_bad_words <- sapply(df_chart_w_lyrics[,"lyrics"], contar_bad_words)
df_chart_w_lyrics_only_explicit <- df_chart_w_lyrics[df_chart_w_lyrics$explicit==TRUE & df_chart_w_lyrics$cant_bad_words > 0, ]
hist(df_chart_w_lyrics_only_explicit$cant_bad_words)
#creo vars categóricas
df_chart_w_lyrics_only_explicit$nivel_puteada <- cut(df_chart_w_lyrics_only_explicit$cant_bad_words, breaks = c(0,10,20,50,Inf), labels=c("bajo","poco","alto","muy_alto"))
df_chart_w_lyrics_only_explicit$nivel_ranking <- cut(df_chart_w_lyrics_only_explicit$position_avg, breaks = c(1,100,Inf), labels=c("1a100","100a200"))
df_chart_w_lyrics_only_explicit$nivel_popularidad <- cut(sqrt(df_chart_w_lyrics_only_explicit$cant_bad_words), breaks = c(0,10,20,50,Inf), labels=c("bajo","poco","alto","muy_alto"))
transactions <- as(as.data.frame(apply(df_chart_w_lyrics_only_explicit, 2, as.factor)), "transactions")
rules = apriori(transactions, parameter=list(target="rules", confidence=0.25, support=0.1))
rules.sub <- subset(rules, subset = lhs %pin% "nivel_puteada" & rhs %pin% "nivel_ranking")
inspect(head(sort(rules.sub, by = "lift", decreasing = TRUE),10))
# discretizacion continuas y seleccion de variables
# identificar palabras explít